'''
Created on Aug 9, 2013

@author: a.renduchintala
'''

import re
import pprint
import numpy as np

# Maps a document id (first tab-separated field, whitespace-stripped) to the
# list of tokens extracted from that document's text.
documents_by_tokens = {}
# Declared here; not populated in this section of the script.
documents_by_topics = {}
# Vocabulary: every distinct token seen in any document, each mapped to 0.
all_tokens = {}


# Each non-blank line of the input is expected to look like "<id>\t<text>".
# Reading through a context manager guarantees the file handle is closed.
with open('../data/documents.txt', 'r') as documents_file:
    # Drop blank lines (notably the empty string a trailing newline leaves
    # after split('\n')): they hold no document, would fail the tab split
    # below, and would inflate any later len(docs) count.
    docs = [line for line in documents_file.read().split('\n') if line.strip()]

for doc in docs:
    fields = doc.split('\t')
    num = fields[0]
    # A line without a tab still raises IndexError here, as it always did.
    text = fields[1].lower()
    # Keep only ASCII letters, digits and whitespace.
    textstripped = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # str.split() with no argument splits on runs of whitespace and never
    # yields empty strings, so whitespace left at the edges once punctuation
    # is removed (e.g. "hello !") cannot produce a spurious '' token.
    tokens = textstripped.split()
    documents_by_tokens[num.strip()] = tokens
    all_tokens.update(dict.fromkeys(tokens, 0))


num_topics = 3
num_documents = len(docs)
# Uniform starting point: a (num_topics x num_documents) float64 matrix, one
# row per topic and one column per entry in docs, with every cell equal to
# 1 / num_topics.
topic_given_document_distribition = np.matrix(
    np.full((num_topics, num_documents), 1.0 / num_topics, dtype=np.float64))


'''
Assign a random topic to each token in documents_by_tokens
'''
